cont <- params$controlCellLine
exp <- params$experimentalCellLine
countmatrix.all <- params$countmatrix.all
metadata.all <- params$metadata.all
rm(params) # Remove the parameters so that we can make subsequent parameterized calls
metadata.pair <- as.data.frame(metadata.all) %>%
filter(CellLine == cont | CellLine == exp)
as.data.frame(metadata.pair)
countmatrix.pair <- countmatrix.all[, metadata.pair$ShortName]
as.data.frame(countmatrix.pair)
Run Deseq on the data set.
# Saving time by just loading the dds we already ran (recent changes are all after this point)
load(str_interp("Rdata/${exp}_vs_${cont}_dds.RData"))
res <- results(dds.pair, contrast = c("CellLine", exp, cont), alpha = 0.05)
res <- res[order(res$log2FoldChange), ]
outFile <- str_interp("output/${exp}_vs_${cont}_deseq_results.csv")
write.csv(as.data.frame(res), file = outFile)
Filter res for padj < 0.05
res.filtered <- as.data.frame(res) %>%
filter(padj < 0.05)
# filter(log2FoldChange >= 1.5 | log2FoldChange <= -1.5)
res.filtered <- res.filtered[order(res.filtered$log2FoldChange, decreasing = TRUE),]
res.filtered
up.unfiltered <- subset(res, log2FoldChange > 0)
up.unfiltered <- up.unfiltered[order(up.unfiltered$log2FoldChange, decreasing = TRUE), ]
outFile <- str_interp("output/${exp}_vs_${cont}_all_upregulated_genes.csv")
write.csv(up.unfiltered[, c("log2FoldChange", "padj")], file = outFile)
up.unfiltered[, c("log2FoldChange", "padj")]
log2 fold change (MLE): CellLine OVCAR3A vs OVCAR3
DataFrame with 10538 rows and 2 columns
log2FoldChange padj
<numeric> <numeric>
HS3ST3B1 11.7378 1.89766e-21
GCGR 11.0064 2.35241e-18
SLX1A 10.9782 1.04130e-18
SLIT3 10.6499 1.76669e-23
SLC1A6 10.6431 1.01154e-17
... ... ...
PDHA1 0.000695976 0.997967
LOC101928489 0.000387853 0.999674
ARIH1 0.000262352 0.999039
ALAD 0.000131583 0.999511
FLVCR2 0.000118652 0.999703
up <- subset(res.filtered, log2FoldChange > 0)
up <- up[order(up$log2FoldChange, decreasing = TRUE), ]
outFile <- str_interp("output/${exp}_vs_${cont}_significantly_upregulated_genes.csv")
write.csv(up[, c("log2FoldChange", "padj")], file = outFile)
print(up[, c("log2FoldChange", "padj")])
down.unfiltered <- subset(res, log2FoldChange < 0)
down.unfiltered <- down.unfiltered[order(down.unfiltered$log2FoldChange, decreasing = FALSE), ]
outFile <- str_interp("output/${exp}_vs_${cont}_all_downregulated_genes.csv")
write.csv(down.unfiltered[, c("log2FoldChange", "padj")], file = outFile)
print(down.unfiltered[, c("log2FoldChange", "padj")])
log2 fold change (MLE): CellLine OVCAR3A vs OVCAR3
DataFrame with 11374 rows and 2 columns
log2FoldChange padj
<numeric> <numeric>
LOC102724852 -12.7565 2.29371e-25
CLDN6 -12.6081 3.73717e-32
AR -12.5227 4.09209e-24
SSX2 -11.6997 4.25276e-21
CTSF -11.5097 2.83502e-20
... ... ...
WASH2P -0.000434051 0.999037
RAB24 -0.000429655 0.998922
FHL1 -0.000386814 0.999037
SRRM5 -0.000309194 0.999699
CASP9 -0.000106682 0.999648
down <- subset(res.filtered, log2FoldChange < 0)
down <- down[order(down$log2FoldChange, decreasing = TRUE), ]
outFile <- str_interp("output/${exp}_vs_${cont}_significantly_downregulated_genes.csv")
write.csv(down[, c("log2FoldChange", "padj")], file = outFile)
print(down[, c("log2FoldChange", "padj")])
as.data.frame(res) %>%
ggplot(aes(x = log2FoldChange, y = -log10(padj), label = rownames(res))) +
geom_point() +
theme_minimal() +
scale_color_manual(values = c("black", "blue", "red")) +
geom_text_repel() +
geom_hline(yintercept = 1.301) +
geom_vline(xintercept = 1.2) +
geom_vline(xintercept = -1.2) +
xlim(-10, 10)
Warning: Removed 12017 rows containing missing values (`geom_point()`).
Warning: Removed 12017 rows containing missing values (`geom_text_repel()`).
Warning: ggrepel: 17693 unlabeled data points (too many overlaps). Consider increasing
max.overlaps
Perform gene set enrichment analysis using Cluster Profiler. This gives us GO pathways that are significantly regulated based on the log2fold change of expression of individual genes.
Using a pvalue Cutoff of 0.05
gene_list <- res$log2FoldChange
names(gene_list) <- rownames(res)
gene_list <- sort(gene_list, decreasing = TRUE)
# Set the seed so our results are reproducible:
set.seed(2023)
gsea_res <- gseGO(gene_list, ont = "BP", OrgDb = "org.Hs.eg.db", keyType = "SYMBOL", seed = TRUE, pvalueCutoff = 0.05)
preparing geneSet collections...
GSEA analysis...
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (13.28% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
leading edge analysis...
done...
# Format output
gsea_res_df <- as.data.frame(gsea_res)
gsea_res_df <- gsea_res_df %>%
mutate(original_row_num = row_number())
gsea_res_df <- gsea_res_df[order(gsea_res_df$NES, decreasing = TRUE),]
row.names(gsea_res_df) <- gsea_res_df$ID
NES is the normalized enrichment score.
gsea_res_df_short <- gsea_res_df[c("pvalue", "p.adjust", "NES", "Description")]
gsea_res_df_short$"core_enrichment_genes" <- gsea_res_df$core_enrichment
gsea_res_df_short.up <- subset(gsea_res_df_short, gsea_res_df_short$NES >= 0)
outFile <- str_interp("output/${exp}_vs_${cont}_significantly_upregulated_pathways.csv")
write.csv(gsea_res_df_short.up, file = outFile)
gsea_res_df_short.up
GSEA plot of the five most upregulated pathways (or least downregulated)
maxIndex <- min(5, nrow(gsea_res_df)) # Prevents us from trying to access out of bounds if there are not five pathways
top5PathwaysIds = gsea_res_df[1:maxIndex, "original_row_num"]
gseaplot2(gsea_res, geneSetID = top5PathwaysIds, pvalue_table = FALSE, ES_geom = "dot")
Volcano Plot (Average NES & adjusted p value)
as.data.frame(gsea_res_df_short.up) %>%
ggplot(aes(x = NES, y = -log10(p.adjust), label = rownames(gsea_res_df_short.up))) +
geom_point() +
theme_minimal() +
scale_color_manual(values = c("black", "blue", "red")) +
geom_text_repel() +
geom_hline(yintercept = 1.301) +
geom_vline(xintercept = 1.2) +
geom_vline(xintercept = -1.2) +
xlim(-10, 10)
gsea_res_df_short.down <- subset(gsea_res_df_short, gsea_res_df_short$NES <= 0)
outFile <- str_interp("output/${exp}_vs_${cont}_significantly_downregulated_pathways.csv")
write.csv(gsea_res_df_short.down, file = outFile)
gsea_res_df_short.down
GSEA plot of the five most downregulated pathways (or least upregulated)
minIndex <- max(1, nrow(gsea_res_df) - 5) # Prevents us from trying to access out of bounds if there are not five downregulated pathways
bottom5PathwaysIds = gsea_res_df[minIndex:nrow(gsea_res_df), "original_row_num"]
gseaplot2(gsea_res, geneSetID = bottom5PathwaysIds, pvalue_table = FALSE, ES_geom = "dot")
Volcano plot (Average NES & adjusted p value)
as.data.frame(gsea_res_df_short.down) %>%
ggplot(aes(x = NES, y = -log10(p.adjust), label = rownames(gsea_res_df_short.down))) +
geom_point() +
theme_minimal() +
scale_color_manual(values = c("black", "blue", "red")) +
geom_text_repel() +
geom_hline(yintercept = 1.301) +
geom_vline(xintercept = 1.2) +
geom_vline(xintercept = -1.2) +
xlim(-10, 10)
Use Revigo to cluster upregulated pathways
revigo_input.cellline.up <- gsea_res_df_short.up[c("p.adjust")]
rownames(revigo_input.cellline.up) <- rownames(gsea_res_df_short.up)
simMatrix <- calculateSimMatrix(rownames(revigo_input.cellline.up),
orgdb = "org.Hs.eg.db",
ont = "BP",
method = "Rel"
)
preparing gene to GO mapping data...
preparing IC data...
scores <- setNames(-log10(revigo_input.cellline.up$p.adjust), rownames(revigo_input.cellline.up))
if (nrow(revigo_input.cellline.up) > 1) {
reducedTerms <- reduceSimMatrix(simMatrix,
scores,
threshold = 0.7,
orgdb = "org.Hs.eg.db"
)
} else {
reducedTerms <- data.frame(matrix(ncol = 0, nrow = 0))
print("There will be no graphs appearing below this because there were not enough significantly upregulated pathways to meaningfully cluster them")
}
Revigo interactive scatter plot. Distances represent the similarity between terms, axes are the first 2 components of a PCA plot, Each bubble indicates the representative (chosen mostly by p-value) from a cluster of terms. Size of the bubble indicates the generality of the term (large meaning a more general term).
if (nrow(reducedTerms) > 2) {
revigo_scatterplot(simMatrix, reducedTerms)
}
Revigo heatmap plot. Similar terms clustered
if (nrow(reducedTerms) > 2) {
heatmapPlot(simMatrix,
reducedTerms,
annotateParent = TRUE,
annotationLabel = "parentTerm",
fontsize = 6
)
}
This is the same content, but interactive.
if (nrow(reducedTerms) > 2) {
revigo_heatmap(simMatrix, reducedTerms)
}
Warning: Specifying width/height in layout() is now deprecated.
Please specify in ggplotly() or plot_ly()
Revigo treemap plot. Terms grouped/colored based on parent. Space is proportional to statistical significance of the GO term (-log10(pvalue)).
if (nrow(reducedTerms) > 2) {
treemapPlot(reducedTerms)
}
Use Revigo to cluster downregulated pathways
revigo_input.cellline.down <- gsea_res_df_short.down[c("p.adjust")]
rownames(revigo_input.cellline.down) <- rownames(gsea_res_df_short.down)
simMatrix <- calculateSimMatrix(rownames(revigo_input.cellline.down),
orgdb = "org.Hs.eg.db",
ont = "BP",
method = "Rel"
)
preparing gene to GO mapping data...
preparing IC data...
scores <- setNames(-log10(revigo_input.cellline.down$p.adjust), rownames(revigo_input.cellline.down))
if (nrow(revigo_input.cellline.down) > 1) {
reducedTerms <- reduceSimMatrix(simMatrix,
scores,
threshold = 0.7,
orgdb = "org.Hs.eg.db"
)
} else {
reducedTerms <- data.frame(matrix(ncol = 0, nrow = 0))
print("There will be no graphs appearing below this because there were not enough significantly downregulated pathways to meaningfully cluster them")
}
Revigo interactive scatter plot. Distances represent the similarity between terms, axes are the first 2 components of a PCA plot, Each bubble indicates the representative (chosen mostly by p-value) from a cluster of terms. Size of the bubble indicates the generality of the term (large meaning a more general term).
if (nrow(reducedTerms) > 2) {
revigo_scatterplot(simMatrix, reducedTerms)
}
Revigo heatmap plot. Similar terms clustered
if (nrow(reducedTerms) > 2) {
heatmapPlot(simMatrix,
reducedTerms,
annotateParent = TRUE,
annotationLabel = "parentTerm",
fontsize = 6
)
}
This is the same content, but interactive.
if (nrow(reducedTerms) > 2) {
revigo_heatmap(simMatrix, reducedTerms)
}
Warning: Specifying width/height in layout() is now deprecated.
Please specify in ggplotly() or plot_ly()
Revigo treemap plot. Terms grouped/colored based on parent. Space is proportional to statistical significance of the GO term (-log10(pvalue)).
if (nrow(reducedTerms) > 2) {
treemapPlot(reducedTerms)
}